samples = ['N1-neg', 'N2-neg', 'N3-neg', 'N4-neg', 'N5-neg', 'N6-neg', 'N7-neg', 'N8-neg', 'N9-neg', 'N10-neg', 'N11-neg', 'N12-neg', 'N13-neg', 'N14-neg', 'N15-neg', 'N16-neg', 'N17-neg', 'N18-neg', 'N19-neg', 'N20-neg', 'N1-pos', 'N2-pos', 'N3-pos', 'N4-pos', 'N5-pos', 'N6-pos', 'N7-pos', 'N8-pos', 'N9-pos', 'N10-pos', 'N11-pos', 'N12-pos', 'N13-pos', 'N14-pos', 'N15-pos', 'N16-pos', 'N17-pos', 'N18-pos', 'N19-pos', 'N20-pos', 'O1-neg', 'O2-neg', 'O3-neg', 'O4-neg', 'O5-neg', 'O6-neg', 'O7-neg', 'O8-neg', 'O9-neg', 'O10-neg', 'O11-neg', 'O12-neg', 'O13-neg', 'O14-neg', 'O15-neg', 'O16-neg', 'O17-neg', 'O18-neg', 'O19-neg', 'O20-neg', 'O1-pos', 'O2-pos', 'O3-pos', 'O4-pos', 'O5-pos', 'O6-pos', 'O7-pos', 'O8-pos', 'O9-pos', 'O10-pos', 'O11-pos', 'O12-pos', 'O13-pos', 'O14-pos', 'O15-pos', 'O16-pos', 'O17-pos', 'O18-pos', 'O19-pos', 'O20-pos']
def get_samples():
TNA_V4V5 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Langille Lab postdoc/COVID/16S_V4V5/exports/feature-table_w_tax.txt', header=0, index_col=0, sep='\t')
TNA_V6V8 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Langille Lab postdoc/COVID/16S_TNA_V6V8/exports/feature-table_w_tax.txt', header=0, index_col=0, sep='\t')
cDNA_V6V8 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Langille Lab postdoc/COVID/16S_cDNA_V6V8/exports/feature-table_w_tax.txt', header=0, index_col=0, sep='\t')
TNA_V4V5_tax = TNA_V4V5.loc[:, 'taxonomy']
TNA_V4V5.drop('taxonomy', axis=1, inplace=True)
TNA_V6V8_tax = TNA_V6V8.loc[:, 'taxonomy']
TNA_V6V8.drop('taxonomy', axis=1, inplace=True)
cDNA_V6V8_tax = cDNA_V6V8.loc[:, 'taxonomy']
cDNA_V6V8.drop('taxonomy', axis=1, inplace=True)
return TNA_V4V5, TNA_V6V8, cDNA_V6V8, TNA_V4V5_tax, TNA_V6V8_tax, cDNA_V6V8_tax
TNA_V4V5, TNA_V6V8, cDNA_V6V8, TNA_V4V5_tax, TNA_V6V8_tax, cDNA_V6V8_tax = get_samples()
plt.figure(figsize=(10,10))
ax1 = plt.subplot(311)
ax2, ax3 = plt.subplot(312, sharey=ax1), plt.subplot(313, sharey=ax1)
TNA_V4V5_sum, TNA_V6V8_sum, cDNA_V6V8_sum = pd.DataFrame(TNA_V4V5.sum(axis=0)).transpose(), pd.DataFrame(TNA_V6V8.sum(axis=0)).transpose(), pd.DataFrame(cDNA_V6V8.sum(axis=0)).transpose()
x = []
for a in range(len(samples)):
if 'neg' in samples[a]:
color = '#0154B2'
else:
color = '#B20154'
if 'N' in samples[a]:
fill = color
else:
fill = 'w'
if samples[a] in TNA_V4V5_sum.columns:
ax1.bar(a, TNA_V4V5_sum.loc[:, samples[a]].values[0], color=fill, edgecolor=color)
if samples[a] in TNA_V6V8_sum.columns:
ax2.bar(a, TNA_V6V8_sum.loc[:, samples[a]].values[0], color=fill, edgecolor=color)
if samples[a] in cDNA_V6V8_sum.columns:
ax3.bar(a, cDNA_V6V8_sum.loc[:, samples[a]].values[0], color=fill, edgecolor=color)
x.append(a)
plt.sca(ax1)
plt.xticks([]), plt.semilogy(), plt.xlim([-1,x[-1]+1]), plt.ylabel('Number of reads'), plt.title('TNA V4V5')
plt.sca(ax2)
plt.xticks([]), plt.semilogy(), plt.xlim([-1,x[-1]+1]), plt.ylabel('Number of reads'), plt.title('TNA V6V8')
plt.sca(ax3)
plt.xticks(x, samples, rotation=90, fontsize=8), plt.semilogy(), plt.xlim([-1,x[-1]+1]), plt.ylabel('Number of reads'), plt.title('cDNA V6V8')
handles = [Patch(facecolor='#B20154', edgecolor='#B20154', label='Positive\nnasopharyngeal'), Patch(facecolor='w', edgecolor='#B20154', label='Positive\noropharyngeal'), Patch(facecolor='#0154B2', edgecolor='#0154B2', label='Negative\nnasopharyngeal'), Patch(facecolor='w', edgecolor='#0154B2', label='Negative\noropharyngeal')]
ax1.legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()
plt.figure(figsize=(10,3.5))
ax1 = plt.subplot(131)
ax2, ax3 = plt.subplot(132, sharey=ax1), plt.subplot(133, sharey=ax1)
ax = [ax1, ax2, ax3]
amplicons = [TNA_V4V5_sum, TNA_V6V8_sum, cDNA_V6V8_sum]
for a in range(3):
NN, NP, ON, OP = [], [], [], []
for b in range(len(samples)):
if samples[b] in amplicons[a].columns:
if 'N' in samples[b] and 'neg' in samples[b]:
NN.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'N' in samples[b] and 'pos' in samples[b]:
NP.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'O' in samples[b] and 'neg' in samples[b]:
ON.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'O' in samples[b] and 'pos' in samples[b]:
OP.append(amplicons[a].loc[:, samples[b]].values[0])
ax[a].boxplot([NN, NP, ON, OP])
t_n, p_n = stats.ttest_ind(NN, NP)
t_o, p_o = stats.ttest_ind(ON, OP)
for c in [[p_n, NN, NP, 1, 2], [p_o, ON, OP, 3, 4]]:
x1, x2 = c[3], c[4]
ma = max(max(c[1]), max(c[2]))
y1, y2, y3 = ma*2, ma*3, ma*5
ax[a].plot([x1, x1, x2, x2], [y1, y2, y2, y1], 'k')
if c[0] < 0.05:
ax[a].text(c[3]+0.5, y3, '*', ha='center', va='center')
else:
ax[a].text(c[3]+0.5, y3, 'NS', ha='center', va='center')
plt.sca(ax[a])
if a == 0: plt.ylabel('Number of reads')
plt.semilogy()
bottom, top = plt.ylim()
plt.ylim(top=top+top*3)
plt.xticks([1, 2, 3, 4], ['Negative\nnasal', 'Positive\nnasal', 'Negative\noral', 'Positive\noral'], fontsize=8)
ax[0].set_title('TNA V4V5'), ax[1].set_title('TNA V6V8'), ax[2].set_title('cDNA V6V8')
plt.tight_layout()
plt.show()
Results of t-tests for statistical significance between negative and positive samples are indicated with asterisks.
TNA_V4V5, TNA_V6V8, cDNA_V6V8, TNA_V4V5_tax, TNA_V6V8_tax, cDNA_V6V8_tax = get_samples()
TNA_V4V5_sum, TNA_V6V8_sum, cDNA_V6V8_sum = pd.DataFrame(TNA_V4V5), pd.DataFrame(TNA_V6V8), pd.DataFrame(cDNA_V6V8)
TNA_V4V5_sum[TNA_V4V5_sum > 1] = 1
TNA_V6V8_sum[TNA_V6V8_sum > 1] = 1
cDNA_V6V8_sum[cDNA_V6V8_sum > 1] = 1
TNA_V4V5_sum, TNA_V6V8_sum, cDNA_V6V8_sum = pd.DataFrame(TNA_V4V5_sum.sum(axis=0)).transpose(), pd.DataFrame(TNA_V6V8_sum.sum(axis=0)).transpose(), pd.DataFrame(cDNA_V6V8_sum.sum(axis=0)).transpose()
plt.figure(figsize=(10,3.5))
ax1 = plt.subplot(131)
ax2, ax3 = plt.subplot(132, sharey=ax1), plt.subplot(133, sharey=ax1)
ax = [ax1, ax2, ax3]
amplicons = [TNA_V4V5_sum, TNA_V6V8_sum, cDNA_V6V8_sum]
for a in range(3):
NN, NP, ON, OP = [], [], [], []
for b in range(len(samples)):
if samples[b] in amplicons[a].columns:
if 'N' in samples[b] and 'neg' in samples[b]: NN.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'N' in samples[b] and 'pos' in samples[b]: NP.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'O' in samples[b] and 'neg' in samples[b]: ON.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'O' in samples[b] and 'pos' in samples[b]: OP.append(amplicons[a].loc[:, samples[b]].values[0])
ax[a].boxplot([NN, NP, ON, OP])
t_n, p_n = stats.ttest_ind(NN, NP)
t_o, p_o = stats.ttest_ind(ON, OP)
for c in [[p_n, NN, NP, 1, 2, ON, OP], [p_o, ON, OP, 3, 4, NN, NP]]:
x1, x2 = c[3], c[4]
ma = max(max(c[1]), max(c[2]), max(c[5]), max(c[6]))
y1 = ma+40
y2, y3 = y1+20, y1+50
ax[a].plot([x1, x1, x2, x2], [y1, y2, y2, y1], 'k')
if c[0] < 0.05:
ax[a].text(c[3]+0.5, y3, '*', ha='center', va='center')
else:
ax[a].text(c[3]+0.5, y3, 'NS', ha='center', va='center')
plt.sca(ax[a])
if a == 0: plt.ylabel('Number of ASVs')
bottom, top = plt.ylim()
plt.ylim(top=650)
plt.xticks([1, 2, 3, 4], ['Negative\nnasal', 'Positive\nnasal', 'Negative\noral', 'Positive\noral'], fontsize=8)
ax[0].set_title('TNA V4V5'), ax[1].set_title('TNA V6V8'), ax[2].set_title('cDNA V6V8')
plt.tight_layout()
plt.show()
Results of t-tests for statistical significance between negative and positive samples are indicated with asterisks.
TNA_V4V5, TNA_V6V8, cDNA_V6V8, TNA_V4V5_tax, TNA_V6V8_tax, cDNA_V6V8_tax = get_samples()
amplicons = [TNA_V4V5, TNA_V6V8, cDNA_V6V8]
for a in range(len(amplicons)):
this_div = []
samples, asvs = list(amplicons[a].columns), list(amplicons[a].index.values)
for b in samples:
this_sample, total = [], amplicons[a].loc[:, b].sum()
for c in asvs:
this_sample.append((amplicons[a].loc[c, b]/total)**2)
this_div.append(1-sum(this_sample))
amplicons[a] = pd.DataFrame(this_div, index=samples).transpose()
plt.figure(figsize=(10,3.5))
ax1 = plt.subplot(131)
ax2, ax3 = plt.subplot(132, sharey=ax1), plt.subplot(133, sharey=ax1)
ax = [ax1, ax2, ax3]
for a in range(3):
NN, NP, ON, OP = [], [], [], []
for b in range(len(samples)):
if samples[b] in amplicons[a].columns:
if 'N' in samples[b] and 'neg' in samples[b]: NN.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'N' in samples[b] and 'pos' in samples[b]: NP.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'O' in samples[b] and 'neg' in samples[b]: ON.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'O' in samples[b] and 'pos' in samples[b]: OP.append(amplicons[a].loc[:, samples[b]].values[0])
ax[a].boxplot([NN, NP, ON, OP])
t_n, p_n = stats.ttest_ind(NN, NP)
t_o, p_o = stats.ttest_ind(ON, OP)
for c in [[p_n, NN, NP, 1, 2, ON, OP], [p_o, ON, OP, 3, 4, NN, NP]]:
x1, x2 = c[3], c[4]
ma = max(max(c[1]), max(c[2]), max(c[5]), max(c[6]))
y1 = ma+0.1
y2, y3 = y1+0.05, y1+0.1
ax[a].plot([x1, x1, x2, x2], [y1, y2, y2, y1], 'k')
if c[0] < 0.05:
ax[a].text(c[3]+0.5, y3, '*', ha='center', va='center')
else:
ax[a].text(c[3]+0.5, y3, 'NS', ha='center', va='center')
plt.sca(ax[a])
if a == 0: plt.ylabel("Simpson's index of diversity")
bottom, top = plt.ylim()
plt.ylim(top=1.3)
plt.xticks([1, 2, 3, 4], ['Negative\nnasal', 'Positive\nnasal', 'Negative\noral', 'Positive\noral'], fontsize=8)
ax[0].set_title('TNA V4V5'), ax[1].set_title('TNA V6V8'), ax[2].set_title('cDNA V6V8')
plt.tight_layout()
plt.show()
Results of t-tests for statistical significance between negative and positive samples are indicated with asterisks.
TNA_V4V5, TNA_V6V8, cDNA_V6V8, TNA_V4V5_tax, TNA_V6V8_tax, cDNA_V6V8_tax = get_samples()
TNA_V4V5_2000 = TNA_V4V5.loc[:, TNA_V4V5.sum(axis=0) > 2000]
TNA_V6V8_2000 = TNA_V6V8.loc[:, TNA_V6V8.sum(axis=0) > 2000]
cDNA_V6V8_2000 = cDNA_V6V8.loc[:, cDNA_V6V8.sum(axis=0) > 2000]
TNA_V4V5_sum, TNA_V6V8_sum, cDNA_V6V8_sum = pd.DataFrame(TNA_V4V5_2000), pd.DataFrame(TNA_V6V8_2000), pd.DataFrame(cDNA_V6V8_2000)
TNA_V4V5_sum[TNA_V4V5_sum > 1] = 1
TNA_V6V8_sum[TNA_V6V8_sum > 1] = 1
cDNA_V6V8_sum[cDNA_V6V8_sum > 1] = 1
TNA_V4V5_sum, TNA_V6V8_sum, cDNA_V6V8_sum = pd.DataFrame(TNA_V4V5_sum.sum(axis=0)).transpose(), pd.DataFrame(TNA_V6V8_sum.sum(axis=0)).transpose(), pd.DataFrame(cDNA_V6V8_sum.sum(axis=0)).transpose()
plt.figure(figsize=(10,3.5))
ax1 = plt.subplot(131)
ax2, ax3 = plt.subplot(132, sharey=ax1), plt.subplot(133, sharey=ax1)
ax = [ax1, ax2, ax3]
amplicons = [TNA_V4V5_sum, TNA_V6V8_sum, cDNA_V6V8_sum]
for a in range(3):
NN, NP, ON, OP = [], [], [], []
for b in range(len(samples)):
if samples[b] in amplicons[a].columns:
if 'N' in samples[b] and 'neg' in samples[b]: NN.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'N' in samples[b] and 'pos' in samples[b]: NP.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'O' in samples[b] and 'neg' in samples[b]: ON.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'O' in samples[b] and 'pos' in samples[b]: OP.append(amplicons[a].loc[:, samples[b]].values[0])
ax[a].boxplot([NN, NP, ON, OP])
t_n, p_n = stats.ttest_ind(NN, NP)
t_o, p_o = stats.ttest_ind(ON, OP)
for c in [[p_n, NN, NP, 1, 2, ON, OP], [p_o, ON, OP, 3, 4, NN, NP]]:
x1, x2 = c[3], c[4]
if len(c[2]) < 1: continue
ma = max(max(c[1]), max(c[2]), max(c[5]), max(c[6]))
y1 = ma+40
y2, y3 = y1+20, y1+50
ax[a].plot([x1, x1, x2, x2], [y1, y2, y2, y1], 'k')
if c[0] < 0.05:
ax[a].text(c[3]+0.5, y3, '*', ha='center', va='center')
else:
ax[a].text(c[3]+0.5, y3, 'NS', ha='center', va='center')
plt.sca(ax[a])
if a == 0: plt.ylabel('Number of ASVs')
bottom, top = plt.ylim()
plt.ylim(top=650)
plt.xticks([1, 2, 3, 4], ['Negative\nnasal', 'Positive\nnasal', 'Negative\noral', 'Positive\noral'], fontsize=8)
ax[0].set_title('TNA V4V5'), ax[1].set_title('TNA V6V8'), ax[2].set_title('cDNA V6V8')
plt.tight_layout()
plt.show()
Results of t-tests for statistical significance between negative and positive samples are indicated with asterisks.
TNA_V4V5, TNA_V6V8, cDNA_V6V8, TNA_V4V5_tax, TNA_V6V8_tax, cDNA_V6V8_tax = get_samples()
TNA_V4V5_2000 = TNA_V4V5.loc[:, TNA_V4V5.sum(axis=0) > 2000]
TNA_V6V8_2000 = TNA_V6V8.loc[:, TNA_V6V8.sum(axis=0) > 2000]
cDNA_V6V8_2000 = cDNA_V6V8.loc[:, cDNA_V6V8.sum(axis=0) > 2000]
amplicons = [TNA_V4V5_2000, TNA_V6V8_2000, cDNA_V6V8_2000]
for a in range(len(amplicons)):
this_div = []
samples, asvs = list(amplicons[a].columns), list(amplicons[a].index.values)
for b in samples:
this_sample, total = [], amplicons[a].loc[:, b].sum()
for c in asvs:
this_sample.append((amplicons[a].loc[c, b]/total)**2)
this_div.append(1-sum(this_sample))
amplicons[a] = pd.DataFrame(this_div, index=samples).transpose()
plt.figure(figsize=(10,3.5))
ax1 = plt.subplot(131)
ax2, ax3 = plt.subplot(132, sharey=ax1), plt.subplot(133, sharey=ax1)
ax = [ax1, ax2, ax3]
for a in range(3):
NN, NP, ON, OP = [], [], [], []
for b in range(len(samples)):
if samples[b] in amplicons[a].columns:
if 'N' in samples[b] and 'neg' in samples[b]: NN.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'N' in samples[b] and 'pos' in samples[b]: NP.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'O' in samples[b] and 'neg' in samples[b]: ON.append(amplicons[a].loc[:, samples[b]].values[0])
elif 'O' in samples[b] and 'pos' in samples[b]: OP.append(amplicons[a].loc[:, samples[b]].values[0])
ax[a].boxplot([NN, NP, ON, OP])
t_n, p_n = stats.ttest_ind(NN, NP)
t_o, p_o = stats.ttest_ind(ON, OP)
for c in [[p_n, NN, NP, 1, 2, ON, OP], [p_o, ON, OP, 3, 4, NN, NP]]:
x1, x2 = c[3], c[4]
if len(c[2]) < 1: continue
ma = max(max(c[1]), max(c[2]), max(c[5]), max(c[6]))
y1 = ma+0.1
y2, y3 = y1+0.05, y1+0.1
ax[a].plot([x1, x1, x2, x2], [y1, y2, y2, y1], 'k')
if c[0] < 0.05:
ax[a].text(c[3]+0.5, y3, '*', ha='center', va='center')
else:
ax[a].text(c[3]+0.5, y3, 'NS', ha='center', va='center')
plt.sca(ax[a])
if a == 0: plt.ylabel("Simpson's index of diversity")
bottom, top = plt.ylim()
plt.ylim(top=1.3)
plt.xticks([1, 2, 3, 4], ['Negative\nnasal', 'Positive\nnasal', 'Negative\noral', 'Positive\noral'], fontsize=8)
ax[0].set_title('TNA V4V5'), ax[1].set_title('TNA V6V8'), ax[2].set_title('cDNA V6V8')
plt.tight_layout()
plt.show()
TNA_V4V5, TNA_V6V8, cDNA_V6V8, TNA_V4V5_tax, TNA_V6V8_tax, cDNA_V6V8_tax = get_samples()
TNA_V4V5_2000 = TNA_V4V5.loc[:, TNA_V4V5.sum(axis=0) > 2000]
TNA_V6V8_2000 = TNA_V6V8.loc[:, TNA_V6V8.sum(axis=0) > 2000]
cDNA_V6V8_2000 = cDNA_V6V8.loc[:, cDNA_V6V8.sum(axis=0) > 2000]
asv_TNA_V4V5 <- py$TNA_V4V5
asv_TNA_V4V5 = as.matrix(asv_TNA_V4V5)
phy_tree_TNA_V4V5 <- read_tree('/Users/robynwright/Documents/OneDrive/Langille Lab postdoc/COVID/16S_V4V5/exports/tree.nwk')
ASV_TNA_V4V5 = otu_table(asv_TNA_V4V5, taxa_are_rows = TRUE)
physeq_TNA_V4V5 = phyloseq(ASV_TNA_V4V5,phy_tree_TNA_V4V5)
w_uf_TNA_V4V5 <- UniFrac(physeq_TNA_V4V5, weighted=TRUE, normalized=FALSE, fast=TRUE)
uw_uf_TNA_V4V5 <- UniFrac(physeq_TNA_V4V5, weighted=FALSE, normalized=FALSE, fast=TRUE)
w_uf_TNA_V4V5 = as.data.frame(as.matrix(w_uf_TNA_V4V5))
uw_uf_TNA_V4V5 = as.data.frame(as.matrix(uw_uf_TNA_V4V5))
asv_cDNA_V6V8 <- py$cDNA_V6V8
asv_cDNA_V6V8 = as.matrix(asv_cDNA_V6V8)
phy_tree_cDNA_V6V8 <- read_tree('/Users/robynwright/Documents/OneDrive/Langille Lab postdoc/COVID/16S_cDNA_V6V8/exports/tree.nwk')
ASV_cDNA_V6V8 = otu_table(asv_cDNA_V6V8, taxa_are_rows = TRUE)
physeq_cDNA_V6V8 = phyloseq(ASV_cDNA_V6V8,phy_tree_cDNA_V6V8)
w_uf_cDNA_V6V8 <- UniFrac(physeq_cDNA_V6V8, weighted=TRUE, normalized=FALSE, fast=TRUE)
uw_uf_cDNA_V6V8 <- UniFrac(physeq_cDNA_V6V8, weighted=FALSE, normalized=FALSE, fast=TRUE)
w_uf_cDNA_V6V8 = as.data.frame(as.matrix(w_uf_cDNA_V6V8))
uw_uf_cDNA_V6V8 = as.data.frame(as.matrix(uw_uf_cDNA_V6V8))
asv_TNA_V6V8 <- py$TNA_V6V8
asv_TNA_V6V8 = as.matrix(asv_TNA_V6V8)
phy_tree_TNA_V6V8 <- read_tree('/Users/robynwright/Documents/OneDrive/Langille Lab postdoc/COVID/16S_TNA_V6V8/exports/tree.nwk')
ASV_TNA_V6V8 = otu_table(asv_TNA_V6V8, taxa_are_rows = TRUE)
physeq_TNA_V6V8 = phyloseq(ASV_TNA_V6V8,phy_tree_TNA_V6V8)
w_uf_TNA_V6V8 <- UniFrac(physeq_TNA_V6V8, weighted=TRUE, normalized=FALSE, fast=TRUE)
uw_uf_TNA_V6V8 <- UniFrac(physeq_TNA_V6V8, weighted=FALSE, normalized=FALSE, fast=TRUE)
w_uf_TNA_V6V8 = as.data.frame(as.matrix(w_uf_TNA_V6V8))
uw_uf_TNA_V6V8 = as.data.frame(as.matrix(uw_uf_TNA_V6V8))
def transform_for_NMDS(similarities, n_jobs=1): #transform the similarity matrix to 2D space (n_components=2)
seed = 3
X_true = similarities.iloc[0:].values
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed, dissimilarity="precomputed", n_jobs=n_jobs)
similarities = np.nan_to_num(similarities)
pos = mds.fit(similarities).embedding_
nmds = manifold.MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12, dissimilarity="precomputed", random_state=seed, n_jobs=n_jobs, n_init=1)
npos = nmds.fit_transform(similarities, init=pos)
npos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((npos ** 2).sum())
clf = PCA()
npos = clf.fit_transform(npos)
return npos, nmds.stress_
def plot_nmds_unifrac(w, uw, names_w, names_uw):
plt.figure(figsize=(10, 3.5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(w)):
if 'neg' in names_w[a]: color = '#0154B2'
else: color = '#B20154'
if 'N' in names_w[a]: fill = color
else: fill = 'w'
ax1.scatter(w[a, 0], w[a, 1], marker='o', edgecolors=color, color=fill)
for a in range(len(uw)):
if 'neg' in names_uw[a]: color = '#0154B2'
else: color = '#B20154'
if 'N' in names_uw[a]: fill = color
else: fill = 'w'
ax1.set_title('Weighted unifrac')
ax2.set_title('Unweighted unifrac')
ax2.scatter(uw[a, 0], uw[a, 1], marker='o', edgecolors=color, color=fill)
handles = [Patch(facecolor='#B20154', edgecolor='#B20154', label='Positive\nnasopharyngeal'), Patch(facecolor='w', edgecolor='#B20154', label='Positive\noropharyngeal'), Patch(facecolor='#0154B2', edgecolor='#0154B2', label='Negative\nnasopharyngeal'), Patch(facecolor='w', edgecolor='#0154B2', label='Negative\noropharyngeal')]
ax2.legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1))
ax1.set_xlabel('nMDS1'), ax1.set_ylabel('nMDS2')
ax2.set_xlabel('nMDS1'), ax2.set_ylabel('nMDS2')
w_uf_TNA_V4V5, uw_uf_TNA_V4V5, w_uf_TNA_V6V8, uw_uf_TNA_V6V8, w_uf_cDNA_V6V8, uw_uf_cDNA_V6V8 = pd.DataFrame(r.w_uf_TNA_V4V5), pd.DataFrame(r.uw_uf_TNA_V4V5), pd.DataFrame(r.w_uf_TNA_V6V8), pd.DataFrame(r.uw_uf_TNA_V6V8), pd.DataFrame(r.w_uf_cDNA_V6V8), pd.DataFrame(r.uw_uf_cDNA_V6V8)
w_npos_TNA_V4V5, stress = transform_for_NMDS(w_uf_TNA_V4V5)
uw_npos_TNA_V4V5, stress = transform_for_NMDS(uw_uf_TNA_V4V5)
plot_nmds_unifrac(w_npos_TNA_V4V5, uw_npos_TNA_V4V5, list(w_uf_TNA_V4V5.columns), list(uw_uf_TNA_V4V5.columns))
plt.tight_layout()
plt.show()
w_npos_TNA_V6V8, stress = transform_for_NMDS(w_uf_TNA_V6V8)
uw_npos_TNA_V6V8, stress = transform_for_NMDS(uw_uf_TNA_V6V8)
plot_nmds_unifrac(w_npos_TNA_V6V8, uw_npos_TNA_V6V8, list(w_uf_TNA_V6V8.columns), list(uw_uf_TNA_V6V8.columns))
plt.tight_layout()
plt.show()
w_npos_cDNA_V6V8, stress = transform_for_NMDS(w_uf_cDNA_V6V8)
uw_npos_cDNA_V6V8, stress = transform_for_NMDS(uw_uf_cDNA_V6V8)
plot_nmds_unifrac(w_npos_cDNA_V6V8, uw_npos_cDNA_V6V8, list(w_uf_cDNA_V6V8.columns), list(uw_uf_cDNA_V6V8.columns))
plt.tight_layout()
plt.show()
asv_TNA_V4V5 <- py$TNA_V4V5_2000
asv_TNA_V4V5 = as.matrix(asv_TNA_V4V5)
phy_tree_TNA_V4V5 <- read_tree('/Users/robynwright/Documents/OneDrive/Langille Lab postdoc/COVID/16S_V4V5/exports/tree.nwk')
ASV_TNA_V4V5 = otu_table(asv_TNA_V4V5, taxa_are_rows = TRUE)
physeq_TNA_V4V5 = phyloseq(ASV_TNA_V4V5,phy_tree_TNA_V4V5)
w_uf_TNA_V4V5 <- UniFrac(physeq_TNA_V4V5, weighted=TRUE, normalized=FALSE, fast=TRUE)
uw_uf_TNA_V4V5 <- UniFrac(physeq_TNA_V4V5, weighted=FALSE, normalized=FALSE, fast=TRUE)
w_uf_TNA_V4V5_2000 = as.data.frame(as.matrix(w_uf_TNA_V4V5))
uw_uf_TNA_V4V5_2000 = as.data.frame(as.matrix(uw_uf_TNA_V4V5))
asv_TNA_V6V8 <- py$TNA_V6V8_2000
asv_TNA_V6V8 = as.matrix(asv_TNA_V6V8)
phy_tree_TNA_V6V8 <- read_tree('/Users/robynwright/Documents/OneDrive/Langille Lab postdoc/COVID/16S_TNA_V6V8/exports/tree.nwk')
ASV_TNA_V6V8 = otu_table(asv_TNA_V6V8, taxa_are_rows = TRUE)
physeq_TNA_V6V8 = phyloseq(ASV_TNA_V6V8,phy_tree_TNA_V6V8)
w_uf_TNA_V6V8 <- UniFrac(physeq_TNA_V6V8, weighted=TRUE, normalized=FALSE, fast=TRUE)
uw_uf_TNA_V6V8 <- UniFrac(physeq_TNA_V6V8, weighted=FALSE, normalized=FALSE, fast=TRUE)
w_uf_TNA_V6V8_2000 = as.data.frame(as.matrix(w_uf_TNA_V6V8))
uw_uf_TNA_V6V8_2000 = as.data.frame(as.matrix(uw_uf_TNA_V6V8))
asv_cDNA_V6V8 <- py$cDNA_V6V8_2000
asv_cDNA_V6V8 = as.matrix(asv_cDNA_V6V8)
phy_tree_cDNA_V6V8 <- read_tree('/Users/robynwright/Documents/OneDrive/Langille Lab postdoc/COVID/16S_cDNA_V6V8/exports/tree.nwk')
ASV_cDNA_V6V8 = otu_table(asv_cDNA_V6V8, taxa_are_rows = TRUE)
physeq_cDNA_V6V8 = phyloseq(ASV_cDNA_V6V8,phy_tree_cDNA_V6V8)
w_uf_cDNA_V6V8 <- UniFrac(physeq_cDNA_V6V8, weighted=TRUE, normalized=FALSE, fast=TRUE)
uw_uf_cDNA_V6V8 <- UniFrac(physeq_cDNA_V6V8, weighted=FALSE, normalized=FALSE, fast=TRUE)
w_uf_cDNA_V6V8_2000 = as.data.frame(as.matrix(w_uf_cDNA_V6V8))
uw_uf_cDNA_V6V8_2000 = as.data.frame(as.matrix(uw_uf_cDNA_V6V8))
w_uf_TNA_V4V5_2000, uw_uf_TNA_V4V5_2000, w_uf_TNA_V6V8_2000, uw_uf_TNA_V6V8_2000, w_uf_cDNA_V6V8_2000, uw_uf_cDNA_V6V8_2000 = pd.DataFrame(r.w_uf_TNA_V4V5_2000), pd.DataFrame(r.uw_uf_TNA_V4V5_2000), pd.DataFrame(r.w_uf_TNA_V6V8_2000), pd.DataFrame(r.uw_uf_TNA_V6V8_2000), pd.DataFrame(r.w_uf_cDNA_V6V8_2000), pd.DataFrame(r.uw_uf_cDNA_V6V8_2000)
w_npos_TNA_V4V5_2000, stress = transform_for_NMDS(w_uf_TNA_V4V5_2000)
uw_npos_TNA_V4V5_2000, stress = transform_for_NMDS(uw_uf_TNA_V4V5_2000)
plot_nmds_unifrac(w_npos_TNA_V4V5_2000, uw_npos_TNA_V4V5_2000, list(w_uf_TNA_V4V5_2000.columns), list(uw_uf_TNA_V4V5_2000.columns))
plt.tight_layout()
plt.show()
w_npos_TNA_V6V8_2000, stress = transform_for_NMDS(w_uf_TNA_V6V8_2000)
uw_npos_TNA_V6V8_2000, stress = transform_for_NMDS(uw_uf_TNA_V6V8_2000)
plot_nmds_unifrac(w_npos_TNA_V6V8_2000, uw_npos_TNA_V6V8_2000, list(w_uf_TNA_V6V8_2000.columns), list(uw_uf_TNA_V6V8_2000.columns))
plt.tight_layout()
plt.show()
w_npos_cDNA_V6V8_2000, stress = transform_for_NMDS(w_uf_cDNA_V6V8_2000)
uw_npos_cDNA_V6V8_2000, stress = transform_for_NMDS(uw_uf_cDNA_V6V8_2000)
plot_nmds_unifrac(w_npos_cDNA_V6V8_2000, uw_npos_cDNA_V6V8_2000, list(w_uf_cDNA_V6V8_2000.columns), list(uw_uf_cDNA_V6V8_2000.columns))
plt.tight_layout()
plt.show()
TNA_V4V5, TNA_V6V8, cDNA_V6V8, TNA_V4V5_tax, TNA_V6V8_tax, cDNA_V6V8_tax = get_samples()
def get_tax_dict(tax):
asv = list(tax.index.values)
tax_dict = {}
tax = pd.DataFrame(tax)
for a in asv:
taxonomy = tax.loc[a, 'taxonomy']
taxonomy = taxonomy.split(';')
for b in range(len(taxonomy)):
try:
taxonomy[b] = taxonomy[b].split('__')[1]
taxonomy[b] = taxonomy[b].replace('_', ' ')
taxonomy[b] = taxonomy[b].replace('Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium', 'Rhizobium')
taxonomy[b] = taxonomy[b].replace('Burkholderia-Caballeronia-Paraburkholderia', 'Burkholderia')
except:
taxonomy[b] = taxonomy[b].replace('_', ' ')
tax_dict[a] = taxonomy
return tax_dict
TNA_V4V5_tax_dict = get_tax_dict(TNA_V4V5_tax)
TNA_V6V8_tax_dict = get_tax_dict(TNA_V6V8_tax)
cDNA_V6V8_tax_dict = get_tax_dict(cDNA_V6V8_tax)
def group_to_level(feat_table, tax_dict, level):
rename_dict = {}
asv = list(feat_table.index.values)
for a in asv:
try:
rename_dict[a] = tax_dict[a][level]
except:
rename_dict[a] = tax_dict[a][-1]
this_feat_table = feat_table.rename(index=rename_dict)
return this_feat_table
def get_colors():
colormap_40b, colormap_40c = mpl.cm.get_cmap('tab20b', 256), mpl.cm.get_cmap('tab20c', 256)
norm, norm2 = mpl.colors.Normalize(vmin=0, vmax=19), mpl.colors.Normalize(vmin=20, vmax=39)
m2, m3 = mpl.cm.ScalarMappable(norm=norm, cmap=colormap_40b), mpl.cm.ScalarMappable(norm=norm2, cmap=colormap_40c)
colors_40 = [m2.to_rgba(a) for a in range(20)]+[m3.to_rgba(a) for a in range(20,40)]
return colors_40+colors_40+colors_40+colors_40
def get_bar(v1, v2, v3, level):
TNA_V4V5_level = group_to_level(v1, TNA_V4V5_tax_dict, level)
TNA_V4V5_level = TNA_V4V5_level.groupby(by=TNA_V4V5_level.index, axis=0).sum()
TNA_V4V5_level = TNA_V4V5_level.divide(TNA_V4V5_level.sum(axis=0))*100
TNA_V4V5_level = TNA_V4V5_level.loc[TNA_V4V5_level.max(axis=1) > 1, :]
TNA_V6V8_level = group_to_level(v2, TNA_V6V8_tax_dict, level)
TNA_V6V8_level = TNA_V6V8_level.groupby(by=TNA_V6V8_level.index, axis=0).sum()
TNA_V6V8_level = TNA_V6V8_level.divide(TNA_V6V8_level.sum(axis=0))*100
TNA_V6V8_level = TNA_V6V8_level.loc[TNA_V6V8_level.max(axis=1) > 1, :]
cDNA_V6V8_level = group_to_level(v3, cDNA_V6V8_tax_dict, level)
cDNA_V6V8_level = cDNA_V6V8_level.groupby(by=cDNA_V6V8_level.index, axis=0).sum()
cDNA_V6V8_level = cDNA_V6V8_level.divide(cDNA_V6V8_level.sum(axis=0))*100
cDNA_V6V8_level = cDNA_V6V8_level.loc[cDNA_V6V8_level.max(axis=1) > 1, :]
all_taxa = list(TNA_V4V5_level.index.values)+list(TNA_V6V8_level.index.values)+list(cDNA_V6V8_level.index.values)
unique_taxa = []
for tax in all_taxa:
if tax not in unique_taxa: unique_taxa.append(tax)
unique_taxa = sorted(unique_taxa)
handles = []
tax_colors = get_colors()
color_dict = {}
for a in range (len(unique_taxa)):
color_dict[unique_taxa[a]] = tax_colors[a]
handles.append(Patch(facecolor=tax_colors[a], label=unique_taxa[a]))
plt.figure(figsize=(10,10))
ax1, ax2, ax3 = plt.subplot(311), plt.subplot(312), plt.subplot(313)
amplicons, axis = [TNA_V4V5_level, TNA_V6V8_level, cDNA_V6V8_level], [ax1, ax2, ax3]
x, xlab = [], []
for a in range(len(samples)):
x.append(a), xlab.append('')
for b in range(len(amplicons)):
bottom = 0
if samples[a] not in amplicons[b].columns: continue
taxa_amp = list(amplicons[b].index.values)
for c in range(len(taxa_amp)):
val = amplicons[b].loc[taxa_amp[c], samples[a]]
axis[b].bar(a, val, bottom=bottom, width=1, color=color_dict[taxa_amp[c]], edgecolor='k')
bottom += val
for z in range(len(axis)):
axis[z].set_xlim([-1, 80]), axis[z].set_ylabel('Relative abundance (%)')
plt.sca(ax1)
plt.xticks(x, xlab)
plt.sca(ax2)
plt.xticks(x, xlab)
plt.sca(ax3)
plt.xticks(x, samples, rotation=90, fontsize=8)
ax1.set_title('TNA V4V5'), ax2.set_title('TNA V6V8'), ax3.set_title('cDNA V6V8')
nc = math.ceil(len(unique_taxa)/50)
ax1.legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1.05), fontsize=8)
return
samples = ['N1-neg', 'N2-neg', 'N3-neg', 'N4-neg', 'N5-neg', 'N6-neg', 'N7-neg', 'N8-neg', 'N9-neg', 'N10-neg', 'N11-neg', 'N12-neg', 'N13-neg', 'N14-neg', 'N15-neg', 'N16-neg', 'N17-neg', 'N18-neg', 'N19-neg', 'N20-neg', 'N1-pos', 'N2-pos', 'N3-pos', 'N4-pos', 'N5-pos', 'N6-pos', 'N7-pos', 'N8-pos', 'N9-pos', 'N10-pos', 'N11-pos', 'N12-pos', 'N13-pos', 'N14-pos', 'N15-pos', 'N16-pos', 'N17-pos', 'N18-pos', 'N19-pos', 'N20-pos', 'O1-neg', 'O2-neg', 'O3-neg', 'O4-neg', 'O5-neg', 'O6-neg', 'O7-neg', 'O8-neg', 'O9-neg', 'O10-neg', 'O11-neg', 'O12-neg', 'O13-neg', 'O14-neg', 'O15-neg', 'O16-neg', 'O17-neg', 'O18-neg', 'O19-neg', 'O20-neg', 'O1-pos', 'O2-pos', 'O3-pos', 'O4-pos', 'O5-pos', 'O6-pos', 'O7-pos', 'O8-pos', 'O9-pos', 'O10-pos', 'O11-pos', 'O12-pos', 'O13-pos', 'O14-pos', 'O15-pos', 'O16-pos', 'O17-pos', 'O18-pos', 'O19-pos', 'O20-pos']
dir = '/Users/robynwright/Documents/OneDrive/Langille Lab postdoc/COVID/amplicon_analysis/figures/'
get_bar(TNA_V4V5, TNA_V6V8, cDNA_V6V8, 1)
plt.savefig(dir+'Bar_all_samples_phylum.png', bbox_inches='tight', dpi=600)
get_bar(TNA_V4V5, TNA_V6V8, cDNA_V6V8, 2)
plt.savefig(dir+'Bar_all_samples_class.png', bbox_inches='tight', dpi=600)
get_bar(TNA_V4V5, TNA_V6V8, cDNA_V6V8, 3)
plt.savefig(dir+'Bar_all_samples_order.png', bbox_inches='tight', dpi=600)
get_bar(TNA_V4V5, TNA_V6V8, cDNA_V6V8, 4)
plt.savefig(dir+'Bar_all_samples_family.png', bbox_inches='tight', dpi=600)
get_bar(TNA_V4V5, TNA_V6V8, cDNA_V6V8, 5)
plt.savefig(dir+'Bar_all_samples_genus.png', bbox_inches='tight', dpi=600)
TNA_V4V5_2000 = TNA_V4V5.loc[:, TNA_V4V5.sum(axis=0) > 2000]
TNA_V6V8_2000 = TNA_V6V8.loc[:, TNA_V6V8.sum(axis=0) > 2000]
cDNA_V6V8_2000 = cDNA_V6V8.loc[:, cDNA_V6V8.sum(axis=0) > 2000]
get_bar(TNA_V4V5_2000, TNA_V6V8_2000, cDNA_V6V8_2000, 1)
plt.savefig(dir+'Bar_samples_ab2000_phylum.png', bbox_inches='tight', dpi=600)
get_bar(TNA_V4V5_2000, TNA_V6V8_2000, cDNA_V6V8_2000, 2)
plt.savefig(dir+'Bar_samples_ab2000_class.png', bbox_inches='tight', dpi=600)